{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.2 本小节学习使用情感词典辅助情感分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import opinion_lexicon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "positive: 2006\n",
      "negative: 4783\n"
     ]
    }
   ],
   "source": [
    "# Load the NLTK opinion lexicon; sets keep the later `word in ...` checks cheap\n",
    "positive_words, negative_words = (\n",
    "    set(opinion_lexicon.positive()),\n",
    "    set(opinion_lexicon.negative()),\n",
    ")\n",
    "\n",
    "# Report how many distinct words each polarity list contains\n",
    "print('positive:', len(positive_words))\n",
    "print('negative:', len(negative_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: 1600\n",
      "test: 400\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import movie_reviews\n",
    "import random\n",
    "random.seed(42)\n",
    "\n",
    "# Load the labelled reviews and split them into train / test sets\n",
    "def load_movie_reviews(train_size=1600):\n",
    "    \"\"\"Read every review of NLTK's movie_reviews corpus and split it in two.\n",
    "\n",
    "    Each review is loaded as raw text and paired with the label 'positive'\n",
    "    (corpus category 'pos') or 'negative' (corpus category 'neg'). The\n",
    "    combined list is shuffled with the module-level `random` generator, so\n",
    "    the split is only reproducible if `random.seed(...)` is called right\n",
    "    before this function.\n",
    "\n",
    "    Args:\n",
    "        train_size: number of shuffled reviews that go into the training\n",
    "            list; all remaining reviews form the test list. Defaults to\n",
    "            1600, the split used throughout this notebook.\n",
    "\n",
    "    Returns:\n",
    "        A `(train_reviews, test_reviews)` tuple; both are lists of\n",
    "        `(text, label)` pairs.\n",
    "    \"\"\"\n",
    "    # Positive reviews first, then negative ones -- same order as before the\n",
    "    # shuffle, so a given seed still produces the same split.\n",
    "    all_reviews = [\n",
    "        (movie_reviews.raw(file_id), label)\n",
    "        for category, label in (('pos', 'positive'), ('neg', 'negative'))\n",
    "        for file_id in movie_reviews.fileids(category)\n",
    "    ]\n",
    "\n",
    "    random.shuffle(all_reviews)\n",
    "\n",
    "    return all_reviews[:train_size], all_reviews[train_size:]\n",
    "\n",
    "# Build the split once; the cells below reuse these two lists\n",
    "train_reviews, test_reviews = load_movie_reviews()\n",
    "for split_name, split_reviews in (('train:', train_reviews), ('test:', test_reviews)):\n",
    "    print(split_name, len(split_reviews))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import NaiveBayesClassifier\n",
    "\n",
    "\n",
    "def train_and_test(extract_feature, train_data, test_data):\n",
    "    \"\"\"Train a Naive Bayes classifier and print its accuracy on the test data.\n",
    "\n",
    "    Args:\n",
    "        extract_feature: callable that turns one review text into a feature dict.\n",
    "        train_data: sequence of `(text, label)` pairs used for training.\n",
    "        test_data: sequence of `(text, label)` pairs used for evaluation.\n",
    "\n",
    "    Returns:\n",
    "        The trained `NaiveBayesClassifier`.\n",
    "    \"\"\"\n",
    "    # Imported locally so this function does not rely on `import nltk`\n",
    "    # having been executed in some other cell before it is called.\n",
    "    from nltk.classify.util import accuracy as compute_accuracy, apply_features\n",
    "\n",
    "    # apply_features builds the feature sets lazily instead of holding every\n",
    "    # feature dict in memory at once.\n",
    "    training_set = apply_features(extract_feature, train_data)\n",
    "    test_set = apply_features(extract_feature, test_data)\n",
    "\n",
    "    classifier = NaiveBayesClassifier.train(training_set)\n",
    "    accuracy = compute_accuracy(classifier, test_set)\n",
    "    print(f'accuracy is {accuracy:.4f}')\n",
    "\n",
    "    return classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 只把情感词当作特征进行提取\n",
    "from nltk import word_tokenize\n",
    "import nltk\n",
    "def extract_feature(text):\n",
    "    \"\"\"Build a feature dict that records which sentiment words a text contains.\n",
    "\n",
    "    The text is lower-cased and tokenised; every token found in\n",
    "    `positive_words` or `negative_words` contributes one entry whose key is\n",
    "    'contain: ' followed by the token and whose value is True. All other\n",
    "    tokens are ignored.\n",
    "    \"\"\"\n",
    "    lexicon_hits = (\n",
    "        token\n",
    "        for token in word_tokenize(text.lower())\n",
    "        if token in positive_words or token in negative_words\n",
    "    )\n",
    "    return {f'contain: {token}': True for token in lexicon_hits}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is 0.8125\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<nltk.classify.naivebayes.NaiveBayesClassifier at 0x157d7159c70>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Naive Bayes on lexicon-word features: prints the test accuracy and, as the\n",
    "# cell's last expression, displays the returned classifier\n",
    "train_and_test(\n",
    "    extract_feature=extract_feature,\n",
    "    train_data=train_reviews,\n",
    "    test_data=test_reviews,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 仅根据情感词的数量进行情感判断（最简单的分类器）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def count_based_classifier(text):\n",
    "    \"\"\"Label a text by comparing how many positive vs. negative words it has.\n",
    "\n",
    "    The text is lower-cased before tokenisation, the same normalisation\n",
    "    `extract_feature` applies, so capitalised occurrences of a lexicon word\n",
    "    are counted as well. A token that appears in both lexicons adds to both\n",
    "    counts. Ties are labelled 'positive'.\n",
    "\n",
    "    Args:\n",
    "        text: the raw review text.\n",
    "\n",
    "    Returns:\n",
    "        'positive' or 'negative'.\n",
    "    \"\"\"\n",
    "    tokens = word_tokenize(text.lower())\n",
    "\n",
    "    # Summing booleans counts the tokens that hit each lexicon\n",
    "    pos_num = sum(token in positive_words for token in tokens)\n",
    "    neg_num = sum(token in negative_words for token in tokens)\n",
    "\n",
    "    return 'positive' if pos_num >= neg_num else 'negative'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accracy is 0.6600\n"
     ]
    }
   ],
   "source": [
    "# Accuracy of the count-based classifier on the held-out test reviews\n",
    "correct_num = 0\n",
    "all_num = len(test_reviews)\n",
    "\n",
    "for review, polarity in test_reviews:\n",
    "    predicted = count_based_classifier(review)\n",
    "    if predicted == polarity:\n",
    "        correct_num += 1\n",
    "\n",
    "# Same message format as train_and_test, so the two results compare directly\n",
    "print(f'accuracy is {correct_num / all_num:.4f}')"
   ]
  },
  {
   "source": [
    "更重要的问题\n",
    "\n",
    "- 如何得到情感词典\n",
    "    - 从语料库中挖掘情感词典 / 自定义种子词\n",
    "    - 使用半监督的方法扩展情感词典\n"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}